# Load project helper functions and configuration by executing the scripts.
# NOTE(review): exec() of file contents runs arbitrary code — acceptable for a
# trusted local notebook, never for untrusted paths.
# Fix: the original used open(...).read() without closing, leaking file
# handles; `with` guarantees the files are closed.
with open("./functions/functions.py") as _fn_file:
    exec(_fn_file.read())
with open("./config/config.py") as _cfg_file:
    exec(_cfg_file.read())
# set style for notebook (_set_css_style is presumably defined by the
# scripts executed above — confirm)
_set_css_style('./www/custom.css')
# toggle code on/off button
# Inject a small JS snippet that adds a button to show/hide all input
# (code) cells in the rendered notebook. The script toggles the jQuery
# 'div.input' selector and starts with code hidden (code_toggle runs on
# document ready with code_show=true).
_toggle_widget = '''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>'''
HTML(_toggle_widget)
# --- Imports and notebook-wide configuration --------------------------------
# NOTE(review): the original cell imported several modules twice. Both
# seaborn aliases (sb/sns) and both plotly.offline aliases (py/offline) are
# kept because later cells may use either, but exact duplicates
# (cross_val_predict, a second graph-objects import rebinding `go`) were
# collapsed, and imports are grouped stdlib / plotting / plotly / ML.

# standard library
import json
import random
import statistics
import warnings
from datetime import date, time, datetime, timedelta

# data manipulation and plotting
import numpy as np
from numpy import mean, absolute
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import seaborn as sns
import shap
from pylab import rcParams
from plydata import define, if_else
from IPython.display import HTML

# plotly (interactive charts)
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.offline as py
import plotly.offline as offline
from plotly.offline import init_notebook_mode, iplot

# render plotly figures inline in the notebook
pio.renderers.default = 'notebook'
py.init_notebook_mode(connected=True)

# machine learning
import pycaret
import sklearn
import featuretools as ft
from statsmodels.formula.api import ols
from sklearn import metrics, preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, mean_squared_error,
                             precision_score, recall_score)
from sklearn.model_selection import (GridSearchCV, cross_val_predict,
                                     train_test_split)
from sklearn.preprocessing import LabelEncoder, PowerTransformer, scale
from imblearn.over_sampling import SMOTE
# star import kept last, as in the original, so pycaret names (setup,
# compare_models, ...) win any collisions
from pycaret.classification import *

# turn off notebook warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
# display floats with two decimals instead of scientific notation
pd.options.display.float_format = '{:.2f}'.format
The next non-bugfix release of Featuretools will not support Python 3.6 MLflow support for Python 3.6 is deprecated and will be dropped in an upcoming release. At that point, existing Python 3.6 workflows that use MLflow will continue to work without modification, but Python 3.6 users will no longer get access to the latest MLflow features and bugfixes. We recommend that you upgrade to Python 3.7 or newer.
Many people struggle to get loans due to insufficient or non-existent credit histories. And, unfortunately, this is often the most marginalised part of the population. This analysis takes a naive approach to understanding consumer defaults due to the rather limited nature of the data. The data set used in this report includes consumer default instances, defined in the data as the TARGET variable. A default is defined as a customer whose payment is more than 90 days late on a given loan, or who failed to repay the loan.
The borrower characteristics include quantitative metrics such as income, credit amount, value of goods purchased, days employed, etc. They also include qualitative/categorical measures such as gender, education, home ownership, mobile ownership, number of children, etc.
This analysis does not take into account financial inclusion for the unbanked population. In frontier markets, it is important to make sure that underserved populations have a positive loan experience and that everyone is given a fair assessment. This data does not take into account alternative data sources to predict customers' repayment abilities.
Alternative data sources that could be incorporated into the model include:
The feature set includes borrower measures of credit usage, income, annuities, and value of goods purchased, as well as qualitative features such as education type, home ownership, birthday, gender, etc. The response variable is a borrower DEFAULT (TARGET = 1).
The target variable takes the value 1 if someone experiences payment difficulties or fails to repay a loan.
# Load the credit risk data, indexing rows directly by the applicant ID
# (SK_ID_CURR) so it is not carried around as a regular column.
credit_risk = pd.read_csv("./data/credit_risk_data.csv",
                          index_col='SK_ID_CURR')
# Quick structural overview: column dtypes and non-null counts.
print('Data types in credit risk dataset as follows:' '\n')
credit_risk.info()
Data types in credit risk dataset as follows: <class 'pandas.core.frame.DataFrame'> Int64Index: 307511 entries, 100002 to 456255 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 TARGET 307511 non-null int64 1 NAME_CONTRACT_TYPE 307511 non-null object 2 CODE_GENDER 307511 non-null object 3 FLAG_OWN_CAR 307511 non-null object 4 FLAG_OWN_REALTY 307511 non-null object 5 CNT_CHILDREN 307511 non-null int64 6 AMT_INCOME_TOTAL 307511 non-null float64 7 AMT_CREDIT 307511 non-null float64 8 AMT_ANNUITY 307499 non-null float64 9 AMT_GOODS_PRICE 307233 non-null float64 10 NAME_INCOME_TYPE 307511 non-null object 11 NAME_EDUCATION_TYPE 307511 non-null object 12 NAME_FAMILY_STATUS 307511 non-null object 13 NAME_HOUSING_TYPE 307511 non-null object 14 DAYS_BIRTH 307511 non-null int64 15 DAYS_EMPLOYED 307511 non-null int64 16 FLAG_MOBIL 307511 non-null int64 17 FLAG_WORK_PHONE 307511 non-null int64 18 OCCUPATION_TYPE 211120 non-null object 19 REGION_RATING_CLIENT 307511 non-null int64 20 REGION_RATING_CLIENT_W_CITY 307511 non-null int64 dtypes: float64(4), int64(8), object(9) memory usage: 51.6+ MB
# Summary statistics, adding extreme tail percentiles (0.1% / 99%) to the
# defaults so outliers in the amount columns are visible.
tail_percentiles = [.001, .1, .9, .99]
credit_risk.describe(percentiles=tail_percentiles)
| TARGET | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | DAYS_BIRTH | DAYS_EMPLOYED | FLAG_MOBIL | FLAG_WORK_PHONE | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 307511.00 | 307511.00 | 307511.00 | 307511.00 | 307499.00 | 307233.00 | 307511.00 | 307511.00 | 307511.00 | 307511.00 | 307511.00 | 307511.00 |
| mean | 0.08 | 0.42 | 168797.92 | 599026.00 | 27108.57 | 538396.21 | -16037.00 | 63815.05 | 1.00 | 0.20 | 2.05 | 2.03 |
| std | 0.27 | 0.72 | 237123.15 | 402490.78 | 14493.74 | 369446.46 | 4363.99 | 141275.77 | 0.00 | 0.40 | 0.51 | 0.50 |
| min | 0.00 | 0.00 | 25650.00 | 45000.00 | 1615.50 | 40500.00 | -25229.00 | -17912.00 | 0.00 | 0.00 | 1.00 | 1.00 |
| 0.1% | 0.00 | 0.00 | 31500.00 | 47970.00 | 3932.98 | 45000.00 | -25005.00 | -14344.45 | 1.00 | 0.00 | 1.00 | 1.00 |
| 10% | 0.00 | 0.00 | 81000.00 | 180000.00 | 11074.50 | 180000.00 | -22181.00 | -4881.00 | 1.00 | 0.00 | 1.00 | 1.00 |
| 50% | 0.00 | 0.00 | 147150.00 | 513531.00 | 24903.00 | 450000.00 | -15750.00 | -1213.00 | 1.00 | 0.00 | 2.00 | 2.00 |
| 90% | 0.00 | 2.00 | 270000.00 | 1133748.00 | 45954.00 | 1093500.00 | -10284.00 | 365243.00 | 1.00 | 1.00 | 3.00 | 3.00 |
| 99% | 1.00 | 3.00 | 472500.00 | 1854000.00 | 70006.50 | 1800000.00 | -8263.00 | 365243.00 | 1.00 | 1.00 | 3.00 | 3.00 |
| max | 1.00 | 19.00 | 117000000.00 | 4050000.00 | 258025.50 | 4050000.00 | -7489.00 | 365243.00 | 1.00 | 1.00 | 3.00 | 3.00 |
# Report only the columns that actually contain missing values.
print('Null values in dataset are as follows:' '\n')
missing_counts = credit_risk.isnull().sum()
print(missing_counts[missing_counts > 0])
Null values in dataset are as follows: AMT_ANNUITY 12 AMT_GOODS_PRICE 278 OCCUPATION_TYPE 96391 dtype: int64
There are some missing values that need to be imputed or removed from the data, namely 'annuity amount', 'occupation type' and 'goods price amount'. There are also columns that don't contain useful information, such as ID.
# Partition column names: all predictors (everything except TARGET), then
# categorical/discrete vs continuous. int64 columns are treated as
# discrete here (flags, counts, region ratings); only the float64 amount
# columns count as continuous.
colnames = list(credit_risk.columns.drop('TARGET'))
categorical_names = credit_risk.select_dtypes(include=[object, np.int64]).columns
continuous_names = credit_risk.select_dtypes(exclude=[object, np.int64]).columns

print("The categorical/discrete features include: ")
print(", ".join(map(str, categorical_names)))
print("\nThe continuous features include: ")
print(", ".join(map(str, continuous_names)))
The categorical/discrete features include: TARGET, NAME_CONTRACT_TYPE, CODE_GENDER, FLAG_OWN_CAR, FLAG_OWN_REALTY, CNT_CHILDREN, NAME_INCOME_TYPE, NAME_EDUCATION_TYPE, NAME_FAMILY_STATUS, NAME_HOUSING_TYPE, DAYS_BIRTH, DAYS_EMPLOYED, FLAG_MOBIL, FLAG_WORK_PHONE, OCCUPATION_TYPE, REGION_RATING_CLIENT, REGION_RATING_CLIENT_W_CITY The continuous features include: AMT_INCOME_TOTAL, AMT_CREDIT, AMT_ANNUITY, AMT_GOODS_PRICE
# Define features (X) and target (y). TARGET is cast to string so the
# downstream tooling treats it as a class label rather than a number.
credit_risk['TARGET'] = credit_risk['TARGET'].astype(str)
X = credit_risk[colnames]      # all predictor columns
y = credit_risk['TARGET']      # class labels ('1' = default)
# 70/30 train/test split.
# Fix: the original split had no random_state (irreproducible run to run)
# and no stratification despite the ~8% default rate; stratify keeps the
# class balance identical in both partitions, and random_state=123 matches
# the session_id used in the pycaret setup.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=123)
credit_train = pd.DataFrame(X_train).assign(TARGET=y_train)
credit_test = pd.DataFrame(X_test).assign(TARGET=y_test)
# Configure the pycaret classification experiment: explicit train/test
# partitions, 10-fold cross-validation, and a fixed seed so runs repeat.
s = setup(
    data=credit_train,
    test_data=credit_test,
    target='TARGET',
    fold=10,
    session_id=123,
    silent=True)
# Baseline: a single logistic regression on the raw features.
best = compare_models(include=['lr'])
| Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
|---|---|---|---|---|---|---|---|---|---|
| lr | Logistic Regression | 0.9115 | 0.5954 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.7310 |
# Make oversized cell outputs scrollable instead of truncated, and let
# embedded images render at their natural width.
output_css = """
<style>
div.output_area {
overflow-y: scroll;
}
div.output_area img {
max-width: unset;
}
</style>
"""
HTML(output_css)
# This function generates useful Exploratory Data Analysis,
# exploring the relationships between features and with the target variable.
# NOTE(review): eda() is not defined in this file — presumably it comes from
# the pycaret star-import (the run log below mentions AutoViz); confirm.
# SVG output keeps the many generated charts lightweight in the notebook.
eda(display_format = 'svg')
Shape of your Data Set loaded: (146213, 21)
############## C L A S S I F Y I N G V A R I A B L E S ####################
Classifying variables in data set...
20 Predictors classified...
1 variables removed since they were ID or low-information variables
################ Binary_Classification VISUALIZATION Started #####################
Total Number of Scatter Plots = 10
Time to run AutoViz (in seconds) = 193 ###################### VISUALIZATION Completed ########################
# Pairwise correlations among the continuous (float) features only.
corr = credit_risk[list(continuous_names)].corr()
# Heatmap of the correlation matrix, labelled with feature names on both
# axes.
sns.heatmap(
    corr,
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values)
<AxesSubplot:>
The above chart shows that the continuous variables are strongly correlated with one another, e.g. a high credit amount correlates with a high goods price (correlation = ~0.9). Such multicollinearity is a problem when interpreting a classical statistical model; it matters less for pure predictive machine learning. The catch is that, in practice, you need to explain the system's behaviour, especially if it makes decisions. ML explainability is important so that intelligent technologies don't inherit societal biases.
# Reshape to long format — one row per (TARGET, feature, value) triple —
# so every continuous feature gets its own box-plot group.
plot_data = (
    credit_risk[['TARGET', *continuous_names]]
    .melt(id_vars="TARGET",
          var_name="name",
          value_name="value")
)
# Notched boxes coloured by default status, one pair of boxes per feature.
fig = px.box(plot_data,
             x="name",
             y="value",
             color="TARGET",
             notched=True)
fig.update_layout(template= "simple_white",
                  title="Target vs continuous variable box plot")
# fig.show()